Goal: can machine learning methods help us to associate metabolites with leaf length?

Previously (script 11b2) I filtered out unnamed metabolites. Here I keep them all. but remove blank samples

library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.0-2
library(relaimpo)
## Loading required package: MASS
## Loading required package: boot
## Loading required package: survey
## Loading required package: grid
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
## 
##     aml
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x tidyr::pack()   masks Matrix::pack()
## x dplyr::select() masks MASS::select()
## x tidyr::unpack() masks Matrix::unpack()

get leaflength data

leaflength <- read_csv("../../plant/output/leaf_lengths_metabolite.csv") %>%
  mutate(pot=str_pad(pot, width=3, pad="0"),
         sampleID=str_c("wyo", genotype, pot, sep="_"), 
         group=str_c(soil,genotype,trt,sep="_")) %>%
  select(sampleID, group, leaf_avg_std)  %>%
  filter(!str_detect(group, "BLANK"))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   pot = col_double(),
##   soil = col_character(),
##   genotype = col_character(),
##   trt = col_character(),
##   leaf_avg = col_double(),
##   leaf_avg_std = col_double()
## )
length(unique(leaflength$group))
## [1] 4
leaflength %>% arrange(sampleID)

get and wrangle metabolite data

met_raw <-read_csv("../input/metabolites_set1.csv")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   tissue = col_character(),
##   soil = col_character(),
##   genotype = col_character(),
##   autoclave = col_character(),
##   time_point = col_character(),
##   concatenate = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
met <- met_raw %>% 
  mutate(pot=str_pad(pot, width = 3, pad = "0")) %>%
  mutate(sampleID=str_c("wyo", genotype, pot, sep="_")) %>%
  filter(soil!="BLANK") %>%
  select(sampleID, genotype, tissue, sample_mass = `sample_mass mg`, !submission_number:concatenate) %>%
  pivot_longer(!sampleID:sample_mass, names_to = "metabolite", values_to = "met_amount") %>%
  
  #adjust by sample mass
  mutate(met_per_mg=met_amount/sample_mass) %>%
  
  #scale and center
  group_by(metabolite, genotype, tissue) %>%
  mutate(met_per_mg=scale(met_per_mg),
         met_amt=scale(met_amount)
  ) %>% 
  pivot_wider(id_cols = sampleID, 
              names_from = c(tissue, metabolite), 
              values_from = starts_with("met_"),
              names_sep = "_")

met

split this into two data frames, one normalized by tissue amount and one not.

met_per_mg <- met %>% select(sampleID,  starts_with("met_per_mg")) %>%
  as.data.frame() %>% column_to_rownames("sampleID")
met_amt <- met %>% select(sampleID,  starts_with("met_amt")) %>%
  as.data.frame() %>% column_to_rownames("sampleID")

get leaf data order to match

leaflength <- leaflength[match(met$sampleID, leaflength$sampleID),]
leaflength

Calc PCAs:

normalized

leaf

met_per_mg.leaf_PCA <- met_per_mg %>% 
  select(matches("_leaf_")) %>%
  prcomp(center = FALSE, scale. = FALSE) #already centered and scaled
names(met_per_mg.leaf_PCA)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
tibble(variance=met_per_mg.leaf_PCA$sdev^2, PC=str_c("PC", 
                                                      str_pad(1:length(met_per_mg.leaf_PCA$sdev), width = 2, pad="0"))) %>%
  mutate(percent_var=100*variance/sum(variance),  
         cumulative_var=cumsum(percent_var)) %>%
  magrittr::extract(1:15,) %>%
  ggplot(aes(x=PC, y=percent_var)) +
  geom_col(fill="skyblue") + 
  geom_line(aes(y=cumulative_var), group="") +
  ggtitle("percent variance explained, named, normalized leaf metabolites")

root

met_per_mg.root_PCA <- met_per_mg %>% 
  select(matches("_root_")) %>%
  prcomp(center = FALSE, scale. = FALSE) #already centered and scaled
names(met_per_mg.root_PCA)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
tibble(variance=met_per_mg.root_PCA$sdev^2, PC=str_c("PC", 
                                                      str_pad(1:length(met_per_mg.root_PCA$sdev), width = 2, pad="0"))) %>%
  mutate(percent_var=100*variance/sum(variance),  
         cumulative_var=cumsum(percent_var)) %>%
  magrittr::extract(1:15,) %>%
  ggplot(aes(x=PC, y=percent_var)) +
  geom_col(fill="skyblue") + 
  geom_line(aes(y=cumulative_var), group="") +
  ggtitle("percent variance explained, named, normalized root metabolites")

raw

leaf

met_amt.leaf_PCA <- met_amt %>%
  select(matches("_leaf_")) %>%
  prcomp(center = FALSE, scale. = FALSE) #already centered and scaled
names(met_per_mg.leaf_PCA)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
tibble(variance=met_amt.leaf_PCA$sdev^2, PC=str_c("PC", 
                                                   str_pad(1:length(met_amt.leaf_PCA$sdev), width = 2, pad="0"))) %>%
  mutate(percent_var=100*variance/sum(variance),  
         cumulative_var=cumsum(percent_var)) %>%
  magrittr::extract(1:15,) %>%
  ggplot(aes(x=PC, y=percent_var)) +
  geom_col(fill="skyblue") + 
  geom_line(aes(y=cumulative_var), group="") +
  ggtitle("percent variance explained, named, raw leaf metabolites")

root

met_amt.root_PCA <- met_amt %>%
  select(matches("_root_")) %>%
  prcomp(center = FALSE, scale. = FALSE) #already centered and scaled
names(met_per_mg.root_PCA)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
tibble(variance=met_amt.root_PCA$sdev^2, PC=str_c("PC", 
                                                   str_pad(1:length(met_amt.root_PCA$sdev), width = 2, pad="0"))) %>%
  mutate(percent_var=100*variance/sum(variance),  
         cumulative_var=cumsum(percent_var)) %>%
  magrittr::extract(1:15,) %>%
  ggplot(aes(x=PC, y=percent_var)) +
  geom_col(fill="skyblue") + 
  geom_line(aes(y=cumulative_var), group="") +
  ggtitle("percent variance explained, named, raw root metabolites")

now try these in a penalized regression

normalized

are the PCs normalized?

colMeans(met_amt.leaf_PCA$x) %>% round(3) #yes centered
##  PC1  PC2  PC3  PC4  PC5  PC6  PC7  PC8  PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
## PC17 PC18 PC19 PC20 PC21 PC22 PC23 PC24 
##    0    0    0    0    0    0    0    0
apply(met_amt.leaf_PCA$x, 2, sd) %>% round(2) #not scaled
##   PC1   PC2   PC3   PC4   PC5   PC6   PC7   PC8   PC9  PC10  PC11  PC12  PC13 
## 11.89  9.03  6.73  6.47  5.80  5.78  5.37  5.08  4.58  4.40  4.36  4.35  4.03 
##  PC14  PC15  PC16  PC17  PC18  PC19  PC20  PC21  PC22  PC23  PC24 
##  3.83  3.76  3.64  3.55  3.46  3.30  3.19  3.14  3.06  0.00  0.00

combine the leaf and root, and then scale them:

met_per_mg.leaf_PCs <- met_per_mg.leaf_PCA$x
colnames(met_per_mg.leaf_PCs) <- str_c("leaf_", colnames(met_per_mg.leaf_PCs))

met_per_mg.root_PCs <- met_per_mg.root_PCA$x
colnames(met_per_mg.root_PCs) <- str_c("root_", colnames(met_per_mg.root_PCs))

met_per_mg.PCs <- cbind(met_per_mg.leaf_PCs, met_per_mg.root_PCs) %>%
  scale()

met_amt.leaf_PCs <- met_amt.leaf_PCA$x
colnames(met_amt.leaf_PCs) <- str_c("leaf_", colnames(met_amt.leaf_PCs))

met_amt.root_PCs <- met_amt.root_PCA$x
colnames(met_amt.root_PCs) <- str_c("root_", colnames(met_amt.root_PCs))

met_amt.PCs <- cbind(met_amt.leaf_PCs, met_amt.root_PCs) %>%
  scale()

also combine the rotations

met_per_mg.leaf_rotation <- met_per_mg.leaf_PCA$rotation %>%
  as.data.frame() %>% 
  rename_with(~ str_c("leaf_", .x)) %>%
  rownames_to_column("metabolite")

met_per_mg.root_rotation <- met_per_mg.root_PCA$rotation %>%
  as.data.frame() %>% 
  rename_with(~ str_c("root_", .x)) %>%
  rownames_to_column("metabolite")

met_per_mg.PC_rotation <- full_join(met_per_mg.leaf_rotation, met_per_mg.root_rotation, by="metabolite")

met_amt.leaf_rotation <- met_amt.leaf_PCA$rotation %>% 
  as.data.frame() %>% 
  rename_with(~ str_c("leaf_", .x)) %>%
  rownames_to_column("metabolite")

met_amt.root_rotation <- met_amt.root_PCA$rotation %>%
  as.data.frame() %>% 
  rename_with(~ str_c("root_", .x)) %>%
  rownames_to_column("metabolite")

met_amt.PC_rotation <- full_join(met_amt.leaf_rotation, met_amt.root_rotation, by="metabolite")
met_per_mg_fit1LOO <- cv.glmnet(x=met_per_mg.PCs, y=leaflength$leaf_avg_std, nfolds = nrow(met_per_mg.PCs), alpha=1 )
## Warning: Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per
## fold
plot(met_per_mg_fit1LOO)

bestlam=met_per_mg_fit1LOO$lambda.1se

NEXT STEP: Do a K-fold CV, repeat many times and average. Might as well do alpha while we are at it. If we are doing alpha, then we need to manually create our own folds list for each run

normalized

multi CV

Fit 101 CVs for each of 11 alphas

set.seed(1245)

folds <- tibble(run=1:101) %>% 
  mutate(folds=map(run, ~ sample(rep(1:4,6))))

system.time (met_per_mg_multiCV <- expand_grid(run=1:100, alpha=round(seq(0,1,.1),1)) %>%
               left_join(folds, by="run") %>%
               mutate(fit=map2(folds, alpha, ~ cv.glmnet(x=met_per_mg.PCs, y=leaflength$leaf_avg_std, foldid = .x, alpha=.y
                                                         )))
             #, lambda=exp(seq(-5,0,length.out = 50)) )))
) #100 seconds
##    user  system elapsed 
##  47.257   1.434  52.780
head(met_per_mg_multiCV)

for each fit, pull out the mean cv error, lambda, min lambda, and 1se lambda

met_per_mg_multiCV <- met_per_mg_multiCV %>%
  mutate(cvm=map(fit, magrittr::extract("cvm")),
         lambda=map(fit, magrittr::extract("lambda")),
         lambda.min=map_dbl(fit, magrittr::extract("lambda.min" )),
         lambda.1se=map_dbl(fit, magrittr::extract("lambda.1se")),
         nzero=map(fit, magrittr::extract("nzero"))
  )

head(met_per_mg_multiCV)

now calculate the mean and sem of cvm and min,1se labmdas. These need to be done separately because of the way the grouping works

met_per_mg_summary_cvm <- met_per_mg_multiCV %>% dplyr::select(-fit, -folds) %>% 
  unnest(c(cvm, lambda)) %>%
  group_by(alpha, lambda) %>%
  summarize(meancvm=mean(cvm), sem=sd(cvm)/sqrt(n()), high=meancvm+sem, low=meancvm-sem)
## `summarise()` regrouping output by 'alpha' (override with `.groups` argument)
met_per_mg_summary_cvm
met_per_mg_summary_lambda <- met_per_mg_multiCV %>% dplyr::select(-fit, -folds, -cvm) %>% 
  group_by(alpha) %>%
  summarize(
    lambda.min.sd=sd(lambda.min), 
    lambda.min.mean=mean(lambda.min),
    #lambda.min.med=median(lambda.min), 
    lambda.min.high=lambda.min.mean+lambda.min.sd,
    #lambda.min.low=lambda.min.mean-lambda.min.sem,
    #lambda.1se.sem=sd(lambda.1se)/sqrt(n()), 
    lambda.1se.mean=mean(lambda.1se),
    #lambda.1se.med=median(lambda.1se), 
    #lambda.1se.high=lambda.1se+lambda.1se.sem,
    #lambda.1se.low=lambda.1se-lambda.1se.sem,
    nzero=nzero[1],
    lambda=lambda[1]
  )
## `summarise()` ungrouping output (override with `.groups` argument)
met_per_mg_summary_lambda

plot it

met_per_mg_summary_cvm %>%
  #filter(alpha!=0) %>% # worse than everything else and throwing the plots off
  ggplot(aes(x=log(lambda), y= meancvm,  ymin=low, ymax=high)) +
  geom_ribbon(alpha=.25) +
  geom_line(aes(color=as.character(alpha))) +
  facet_wrap(~ as.character(alpha)) +
   coord_cartesian(xlim=(c(-5,0))) +
  geom_vline(aes(xintercept=log(lambda.min.mean)), alpha=.5, data=met_per_mg_summary_lambda) +
  geom_vline(aes(xintercept=log(lambda.min.high)), alpha=.5, data=met_per_mg_summary_lambda, color="blue") 

So overall these look more reasonable than the LOO plot.

Make a plot of MSE at minimum lambda for each alpha

met_per_mg_summary_cvm %>% 
  group_by(alpha) %>%
  filter(rank(meancvm, ties.method = "first")==1) %>%
  ggplot(aes(x=alpha,y=meancvm,ymin=low,ymax=high)) +
  geom_ribbon(color=NA, fill="gray80") +
  geom_line() +
  geom_point()

not a particular large difference there, aside from 0.1 and even then, not too much better.

Plot the number of nzero coefficients

met_per_mg_summary_lambda %>%
  unnest(c(lambda, nzero)) %>%
  group_by(alpha) %>%
  filter(abs(lambda.min.mean-lambda)==min(abs(lambda.min.mean-lambda))  ) %>%
  ungroup() %>%

ggplot(aes(x=as.character(alpha), y=nzero)) +
  geom_point() +
  ggtitle("Number of non-zero coefficents at minimum lambda") +
  ylim(0,24)
## Warning: Removed 1 rows containing missing values (geom_point).

OK let’s do repeated test train starting from these CV lambdas

multi_tt <- function(lambda, alpha, n=10000, sample_size=24, train_size=20, x, y=leaflength$leaf_avg_std) {
  print(lambda)
  print(alpha)
tt <-
  tibble(run=1:n) %>%
  mutate(train=map(run, ~ sample(1:sample_size, train_size))) %>%
  mutate(fit=map(train, ~ glmnet(x=x[.,], y=y[.], lambda = lambda, alpha = alpha ))) %>%
  
  mutate(pred=map2(fit, train, ~ predict(.x, newx = x[-.y,]))) %>%
  mutate(cor=map2_dbl(pred, train, ~ cor(.x, y[-.y])  )) %>%
  mutate(MSE=map2_dbl(pred, train, ~ mean((y[-.y] - .x)^2))) %>%
  summarize(
    num_na=sum(is.na(cor)), 
    num_lt_0=sum(cor<=0, na.rm=TRUE),
    avg_cor=mean(cor, na.rm=TRUE),
    avg_MSE=mean(MSE))
tt
}

per_mg_fit_test_train <- met_per_mg_summary_lambda %>% 
  select(alpha, lambda.min.mean)

per_mg_fit_test_train <- met_per_mg_multiCV %>%
  filter(run==1) %>%
  select(alpha, fit) %>%
  right_join(per_mg_fit_test_train)
## Joining, by = "alpha"
per_mg_fit_test_train <- per_mg_fit_test_train %>%
  mutate(pred_full=map2(fit, lambda.min.mean, ~ predict(.x, s=.y, newx=met_per_mg.PCs)),
         full_R=map_dbl(pred_full, ~ cor(.x, leaflength$leaf_avg_std)),
         full_MSE=map_dbl(pred_full, ~ mean((leaflength$leaf_avg_std-.x)^2))) %>%
  
  mutate(tt=map2(lambda.min.mean, alpha, ~ multi_tt(lambda=.x, alpha=.y, x=met_per_mg.PCs)))
## [1] 63.76959
## [1] 0
## [1] 3.23426
## [1] 0.1
## [1] 1.96389
## [1] 0.2
## [1] 1.390028
## [1] 0.3
## [1] 1.057486
## [1] 0.4
## [1] 0.8344099
## [1] 0.5
## [1] 0.7009496
## [1] 0.6
## [1] 0.5978767
## [1] 0.7
## [1] 0.527807
## [1] 0.8
## [1] 0.4681233
## [1] 0.9
## [1] 0.4171747
## [1] 1
(per_mg_fit_test_train <- per_mg_fit_test_train %>% unnest(tt))
per_mg_fit_test_train %>%
  ggplot(aes(x=alpha)) +
  geom_line(aes(y=avg_cor), color="red") +
  geom_point(aes(y=avg_cor), color="red") +
  geom_line(aes(y=avg_MSE), color="blue") +
  geom_point(aes(y=avg_MSE), color="blue")

alpha of 0.8 to 1.0 are very similar and are the best here.

look at fit:

alpha_per_mg <- .8

best_per_mg <- per_mg_fit_test_train %>% filter(alpha == alpha_per_mg) 
best_per_mg_fit <- best_per_mg$fit[[1]]
best_per_mg_lambda <- best_per_mg$lambda.min.mean

per_mg_coef.tb <- coef(best_per_mg_fit, s=best_per_mg_lambda) %>% 
  as.matrix() %>% as.data.frame() %>% 
  rownames_to_column(var="PC") %>%
  rename(beta=`1`)
  
per_mg_coef.tb %>% filter(beta!=0) %>% arrange(beta)

pred and obs

plot(leaflength$leaf_avg_std, best_per_mg$pred_full[[1]])

cor.test(leaflength$leaf_avg_std, best_per_mg$pred_full[[1]]) #.57
## 
##  Pearson's product-moment correlation
## 
## data:  leaflength$leaf_avg_std and best_per_mg$pred_full[[1]]
## t = 3.7902, df = 22, p-value = 0.001005
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3015850 0.8231987
## sample estimates:
##       cor 
## 0.6285173
best_per_mg$full_MSE
## [1] 0.8478287

Percent variance explained

per_mg_vars <- per_mg_coef.tb %>% 
  filter(beta !=0, PC!="(Intercept)") %>%
  pull(PC) %>% c("leaf_avg_std", .)

per_mg_relimp <- leaflength %>% select(leaf_avg_std) %>% cbind(met_per_mg.PCs) %>% as.data.frame() %>% dplyr::select(all_of(per_mg_vars)) %>%
  calc.relimp() 

per_mg_coef.tb <- per_mg_relimp@lmg %>% as.matrix() %>% as.data.frame() %>%
  rownames_to_column("PC") %>%
  rename(PropVar_met_per_mg=V1) %>%
  full_join(per_mg_coef.tb) %>%
  arrange(desc(PropVar_met_per_mg))
## Joining, by = "PC"
per_mg_coef.tb

Checkout the rotations.

met_per_mg_rotation_out <- met_per_mg.PC_rotation %>% 
  pivot_longer(-metabolite, names_to="PC", values_to="loading") %>%
  filter(PC %in% filter(per_mg_coef.tb, beta!=0)$PC ) %>%
  group_by(PC) %>%
    filter(!str_detect(metabolite,".*(leaf|root)_[0-9]*$")) %>%
  filter(abs(loading) >= 0.05) %>%
  left_join(per_mg_coef.tb, by="PC") %>%
  arrange(desc(abs(beta)), desc(abs(loading))) %>%
  mutate(organ=ifelse(str_detect(metabolite, "_leaf_"), "leaf", "root"),
         transformation="normalized",
         metabolite=str_remove(metabolite, "met_per_mg_(root|leaf)_"),
         metabolite_effect_on_leaf=ifelse(beta*loading>0, "increase", "decrease"))
met_per_mg_rotation_out %>%  write_csv("../output/Leaf_associated_metabolites_normalized_NOBLANK.csv")

non-normazlized

multi CV

Fit 101 CVs for each of 11 alphas

set.seed(1245)

folds <- tibble(run=1:101) %>% 
  mutate(folds=map(run, ~ sample(rep(1:6,4))))

system.time (met_amt_multiCV <- expand_grid(run=1:100, alpha=round(seq(0,1,.1),1)) %>%
               left_join(folds, by="run") %>%
               mutate(fit=map2(folds, alpha, ~ cv.glmnet(x=met_amt.PCs, y=leaflength$leaf_avg_std, foldid = .x, alpha=.y
                                                         )))
             #, lambda=exp(seq(-5,0,length.out = 50)) )))
) #100 seconds
##    user  system elapsed 
##  68.531   2.282  81.667
head(met_amt_multiCV)

for each fit, pull out the mean cv error, lambda, min lambda, and 1se lambda

met_amt_multiCV <- met_amt_multiCV %>%
  mutate(cvm=map(fit, magrittr::extract("cvm")),
         lambda=map(fit, magrittr::extract("lambda")),
         lambda.min=map_dbl(fit, magrittr::extract("lambda.min" )),
         lambda.1se=map_dbl(fit, magrittr::extract("lambda.1se")),
         nzero=map(fit, magrittr::extract("nzero"))
  )

head(met_amt_multiCV)

now calculate the mean and sem of cvm and min,1se labmdas. These need to be done separately because of the way the grouping works

met_amt_summary_cvm <- met_amt_multiCV %>% dplyr::select(-fit, -folds) %>% 
  unnest(c(cvm, lambda)) %>%
  group_by(alpha, lambda) %>%
  summarize(meancvm=mean(cvm), sem=sd(cvm)/sqrt(n()), high=meancvm+sem, low=meancvm-sem)
## `summarise()` regrouping output by 'alpha' (override with `.groups` argument)
met_amt_summary_cvm
met_amt_summary_lambda <- met_amt_multiCV %>% dplyr::select(-fit, -folds, -cvm) %>% 
  group_by(alpha) %>%
  summarize(
    lambda.min.sd=sd(lambda.min), 
    lambda.min.mean=mean(lambda.min),
    #lambda.min.med=median(lambda.min), 
    lambda.min.high=lambda.min.mean+lambda.min.sd,
    #lambda.min.low=lambda.min.mean-lambda.min.sem,
    #lambda.1se.sem=sd(lambda.1se)/sqrt(n()), 
    lambda.1se.mean=mean(lambda.1se),
    #lambda.1se.med=median(lambda.1se), 
    #lambda.1se.high=lambda.1se+lambda.1se.sem,
    #lambda.1se.low=lambda.1se-lambda.1se.sem,
    nzero=nzero[1],
    lambda=lambda[1]
  )
## `summarise()` ungrouping output (override with `.groups` argument)
met_amt_summary_lambda

plot it

met_amt_summary_cvm %>%
  #filter(alpha!=0) %>% # worse than everything else and throwing the plots off
  ggplot(aes(x=log(lambda), y= meancvm,  ymin=low, ymax=high)) +
  geom_ribbon(alpha=.25) +
  geom_line(aes(color=as.character(alpha))) +
  facet_wrap(~ as.character(alpha)) +
   coord_cartesian(xlim=(c(-5,0))) +
  geom_vline(aes(xintercept=log(lambda.min.mean)), alpha=.5, data=met_amt_summary_lambda) +
  geom_vline(aes(xintercept=log(lambda.min.high)), alpha=.5, data=met_amt_summary_lambda, color="blue") 

Make a plot of MSE at minimum lambda for each alpha

met_amt_summary_cvm %>% 
  group_by(alpha) %>%
  filter(rank(meancvm, ties.method = "first")==1) %>%
  ggplot(aes(x=alpha,y=meancvm,ymin=low,ymax=high)) +
  geom_ribbon(color=NA, fill="gray80") +
  geom_line() +
  geom_point()

not a particular large difference here after 0.2

Plot the number of nzero coefficients

met_amt_summary_lambda %>%
  unnest(c(lambda, nzero)) %>%
  group_by(alpha) %>%
  filter(abs(lambda.min.mean-lambda)==min(abs(lambda.min.mean-lambda))  ) %>%
  ungroup() %>%

ggplot(aes(x=as.character(alpha), y=nzero)) +
  geom_point() +
  ggtitle("Number of non-zero coefficents at minimum lambda") +
  ylim(0,36)
## Warning: Removed 1 rows containing missing values (geom_point).

OK let’s do repeated test train starting from these CV lambdas

multi_tt <- function(lambda, alpha, n=10000, sample_size=24, train_size=20, x, y=leaflength$leaf_avg_std) {
  print(lambda)
  print(alpha)
tt <-
  tibble(run=1:n) %>%
  mutate(train=map(run, ~ sample(1:sample_size, train_size))) %>%
  mutate(fit=map(train, ~ glmnet(x=x[.,], y=y[.], lambda = lambda, alpha = alpha ))) %>%
  
  mutate(pred=map2(fit, train, ~ predict(.x, newx = x[-.y,]))) %>%
  mutate(cor=map2_dbl(pred, train, ~ cor(.x, y[-.y])  )) %>%
  mutate(MSE=map2_dbl(pred, train, ~ mean((y[-.y] - .x)^2))) %>%
  summarize(
    num_na=sum(is.na(cor)), 
    num_lt_0=sum(cor<=0, na.rm=TRUE),
    avg_cor=mean(cor, na.rm=TRUE),
    avg_MSE=mean(MSE))
tt
}

amt_fit_test_train <- met_amt_summary_lambda %>% 
  select(alpha, lambda.min.mean)

amt_fit_test_train <- met_amt_multiCV %>%
  filter(run==1) %>%
  select(alpha, fit) %>%
  right_join(amt_fit_test_train)
## Joining, by = "alpha"
amt_fit_test_train <- amt_fit_test_train %>%
  mutate(pred_full=map2(fit, lambda.min.mean, ~ predict(.x, s=.y, newx=met_amt.PCs)),
         full_R=map_dbl(pred_full, ~ cor(.x, leaflength$leaf_avg_std)),
         full_MSE=map_dbl(pred_full, ~ mean((leaflength$leaf_avg_std-.x)^2))) %>%
  
  mutate(tt=map2(lambda.min.mean, alpha, ~ multi_tt(lambda=.x, alpha=.y, x=met_amt.PCs)))
## [1] 129.3889
## [1] 0
## [1] 2.394178
## [1] 0.1
## [1] 1.622926
## [1] 0.2
## [1] 1.129161
## [1] 0.3
## [1] 0.8782994
## [1] 0.4
## [1] 0.709034
## [1] 0.5
## [1] 0.5979773
## [1] 0.6
## [1] 0.5183774
## [1] 0.7
## [1] 0.4567253
## [1] 0.8
## [1] 0.4084716
## [1] 0.9
## [1] 0.3686524
## [1] 1
(amt_fit_test_train <- amt_fit_test_train %>% unnest(tt))
amt_fit_test_train %>%
  ggplot(aes(x=alpha)) +
  geom_line(aes(y=avg_cor), color="red") +
  geom_point(aes(y=avg_cor), color="red") +
  geom_line(aes(y=avg_MSE), color="blue") +
  geom_point(aes(y=avg_MSE), color="blue")

alpha of 0.8 to 1.0 are very similar and are the best here.

look at fit:

alpha_amt <- .8

best_amt <- amt_fit_test_train %>% filter(alpha == alpha_amt) 
best_amt_fit <- best_amt$fit[[1]]
best_amt_lambda <- best_amt$lambda.min.mean

amt_coef.tb <- coef(best_amt_fit, s=best_amt_lambda) %>% 
  as.matrix() %>% as.data.frame() %>% 
  rownames_to_column(var="PC") %>%
  rename(beta=`1`)
  
amt_coef.tb %>% filter(beta!=0) %>% arrange(beta)

pred and obs

plot(leaflength$leaf_avg_std, best_amt$pred_full[[1]])

cor.test(leaflength$leaf_avg_std, best_amt$pred_full[[1]]) #.64
## 
##  Pearson's product-moment correlation
## 
## data:  leaflength$leaf_avg_std and best_amt$pred_full[[1]]
## t = 3.8913, df = 22, p-value = 0.0007859
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3166703 0.8285020
## sample estimates:
##       cor 
## 0.6385023
best_amt$full_MSE
## [1] 0.6742621

Percent variance explained

amt_vars <- amt_coef.tb %>% 
  filter(beta !=0, PC!="(Intercept)") %>%
  pull(PC) %>% c("leaf_avg_std", .)

# because there is only one explanatory variable, we don't need to (and can't) use relimp
amt_relimp <- leaflength %>% select(leaf_avg_std) %>% cbind(met_amt.PCs) %>% as.data.frame() %>% dplyr::select(all_of(amt_vars)) %>% cor() %>% magrittr::extract(1,2) %>% magrittr::multiply_by(.,.)
amt_relimp <- tibble(PC=str_subset(amt_vars, "PC"),
                     PropVar_met_amt=amt_relimp)


amt_coef.tb <- amt_relimp %>% 
  full_join(amt_coef.tb) %>%
  arrange(desc(PropVar_met_amt))
## Joining, by = "PC"
amt_coef.tb

Checkout the rotations.

met_amt_rotation_out <- met_amt.PC_rotation %>% 
  pivot_longer(-metabolite, names_to="PC", values_to="loading") %>%
  filter(PC %in% filter(amt_coef.tb, beta!=0)$PC ) %>%
  group_by(PC) %>%
    filter(!str_detect(metabolite,".*(leaf|root)_[0-9]*$")) %>%
  filter(abs(loading) >= 0.05) %>%
  left_join(amt_coef.tb, by="PC") %>%
  arrange(desc(abs(beta)), desc(abs(loading))) %>%
  mutate(organ=ifelse(str_detect(metabolite, "_leaf_"), "leaf", "root"),
         transformation="raw",
         metabolite=str_remove(metabolite, "met_amt_(root|leaf)_"),
         metabolite_effect_on_leaf=ifelse(beta*loading>0, "increase", "decrease"))
met_amt_rotation_out %>%  write_csv("../output/Leaf_associated_metabolites_raw_NOBLANK.csv")

met_amt_rotation_out